/**********************************************************************/
/*
   Author: Michelle Han
   Created: 24 August 2022
   Updated: Apr 2024
   Description: Cleans the raw August 2021 SAKERNAS data.

   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_aug21_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
   sak_aug21_deid_clean_merged
*/
/**********************************************************************/

*******************************************
* Setup

* File paths: only loaded here when this do-file is not driven by a master script
  if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

* Log: close any log left open, then start a dated text log for this run
  capture log close
  global prefix : display %tdCYND td(`c(current_date)')
  log using "$KP_logs/${prefix}_clean_SAKERNAS_aug2021.txt", text replace

  set more off
  clear

* Load the raw de-identified file and give the weight variable its working name
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_21AUG_deid.dta", clear
  rename final_weig weight

/*----------------------------------------------------*/
              /* Section: Demographics */
/*----------------------------------------------------*/

* create HH id
* One id per distinct (psu, no_dsrt) combination.
  egen hh_id = group(psu no_dsrt)
  summ hh_id
* Observations per household id: r(N) is the row count and r(max) the number of
* ids. Displayed immediately after -summarize- so that no intervening command
* can overwrite the r() results before they are read.
  di `r(N)' / `r(max)'
  rename hh_id hh_id_s // got this from Nikhil's code, not sure why

* birth year: copy k5_th, inspect it, then turn the 9999 code into missing
  generate year_dob_sak = k5_th
  tabulate year_dob_sak
  mvdecode year_dob_sak, mv(9999)

* birth month: copy k5_bln, inspect it, then turn the 99 code into missing
  generate month_dob_sak = k5_bln
  tabulate month_dob_sak
  mvdecode month_dob_sak, mv(99)

* age in years (k6) and a five-band age category
  summarize k6
  generate age_sak = k6
  recode age_sak ///
    (0/20   = 1) ///
    (21/40  = 2) ///
    (41/60  = 3) ///
    (61/80  = 4) ///
    (81/100 = 5), generate(age_cat_sak)

* Dummy for age <= 30 (a missing age evaluates as "not <= 30", i.e. 0)
  generate young = (age_sak <= 30)
  label define young 0 "Over 30 Years Old" 1 "30 and Under"
  label values young young

* number of HH members: jart, and jart net of jart5
  generate hh_size_sak = jart
  generate hh_size_sak_5 = jart - jart5

* relationship to HH head (k3), with English value labels
  tabulate k3, missing
  generate relation_hh_head = k3
  label define relations ///
    1 "HH head" ///
    2 "Spouse" ///
    3 "Son/Daughter" ///
    4 "Step/adopted child" ///
    5 "Son/Daughter-in-law" ///
    6 "Grandchild" ///
    7 "Parent/Parent-in-law" ///
    8 "Other family" ///
    9 "Housemaid" ///
    10 "Driver/Gardener" ///
    11 "No relation" ///
    , replace
  label values relation_hh_head relations
  tabulate relation_hh_head

* order in roster: urutan must uniquely identify rows; flag the first row of
* each household when sorted by urutan
  isid urutan
  by hh_id_s (urutan), sort: generate roster_first = (_n == 1)
  tabulate relation_hh_head roster_first

* marital status: one indicator per r4 code (missing r4 gives 0 on all four)
  tabulate r4, missing
  generate married  = (r4 == 2)
  generate divorced = (r4 == 3)
  generate widowed  = (r4 == 4)
  generate single   = (r4 == 1)

* gender: female indicator from k4
  tabulate k4, missing
  generate female = (k4 == 2)

* gender (version 2): labelled indicator with the opposite coding (1 = male)
  generate gender = (k4 == 1)
  label define gender 1 "male"  0 "female"
  label values gender gender

* current student
  tabulate r5, missing
  generate current_student = (r5 == 2)

* education levels: mutually exclusive indicators built from r6a
  tabulate r6a, missing
  generate no_elementary = (r6a == 1)
  generate elementary    = (r6a == 2)
  generate junior_high   = (r6a == 3)
  generate high_school   = inlist(r6a, 4, 5)
  generate tertiary      = inrange(r6a, 6, 8)
  summarize no_elementary elementary junior_high high_school tertiary

* education -> years of schooling (stays missing for any other r6a value)
  generate educ = r6a
  generate school_years = .
  replace school_years = 3  if educ == 1             // No elementary
  replace school_years = 6  if educ == 2             // elementary
  replace school_years = 9  if educ == 3             // junior high
  replace school_years = 12 if educ == 4 | educ == 5 // high (8-11)
  replace school_years = 14 if educ == 6             // Diploma I/II/III
  replace school_years = 16 if educ == 7             // Diploma IV
  replace school_years = 18 if educ == 8             // S1/S2/S3

* Dummy for educated (graduating high school)
* NOTE(review): the comparison is strict (> 12), so exactly 12 years codes as 0
* even though the value label reads "12 or More", and a missing school_years
* codes as 1 because missing compares greater than any number. Confirm the
* intended cutoff; changing it alters the data checked by the datasignature
* guard at save time.
  generate educated = (school_years > 12)
  label define educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
  label values educated educated

  
* training courses: ever done a training course with certificate?
  tabulate r6e, missing
  generate train_certif = (r6e == 1)

* training courses: taken a course in the last 3 years (both r6d and r6g equal 1)
  tabulate r6d, missing
  tabulate r6g, missing
  generate training_3yrs = (r6d == 1) & (r6g == 1)

* live in the same kabupaten you were born in?
* The "born" kabupaten is proxied by r7b_kab, the kabupaten of residence 5 years ago.
  generate born_kab = r7b_kab
  generate born_live_same = (born_kab == kode_kab)

* Prov 5 yrs ago
  generate born_prov = r7b_prov

* migration: r7b coded 2 or 3
  tabulate r7b, missing
  generate migrated = inlist(r7b, 2, 3)

* Migration. Place of residence now (prov/kab) != place of residence in 2015
* kab5 is again the kabupaten of residence 5 years ago (r7b_kab).
  generate kab5 = r7b_kab
  generate migration = !(kode_kab == kab5)

* disabilities
* For each item r8a-r8f: tabulate it, build a 0/1 indicator for codes 2 or 3,
* and keep a copy of the raw code. Variables are created in the same order as
* the items are listed (vision, hearing, walk, hand, speech, other).
  local kinds vision hearing walk hand speech other
  local items a b c d e f
  forvalues i = 1/6 {
    local kind : word `i' of `kinds'
    local item : word `i' of `items'

    tabulate r8`item', missing
    generate `kind'_disabled = inlist(r8`item', 2, 3)
    generate `kind'_disability = r8`item'
  }
  summarize *disabled

* any of the six indicators switched on
  generate any_disability = (vision_disabled | hearing_disabled | walk_disabled | ///
    hand_disabled | speech_disabled | other_disabled)
  tabulate any_disability

* at least one item reported with code 3
  generate severe_disability = inlist(3, r8a, r8b, r8c, r8d, r8e, r8f)
  tabulate severe_disability


* urban-rural: indicator for klas == 1
  tab klas, m
  gen urban_sak = klas == 1

* alternative urban/rural variable, check
* Flags kabupaten codes whose value divided by 10 lies in [7, 8), i.e. codes
* from 70 up to (but excluding) 80. The relational operator is written as the
* single token >= (the original had a space inside it, "> =").
* city_sak_5 applies the same rule to born_kab and is left missing when
* born_kab is missing; city_sak has no such guard and evaluates to 0 for a
* missing kode_kab.
  gen city_sak = kode_kab/10 >= 7 & kode_kab/10 < 8
  gen city_sak_5 = born_kab/10 >= 7 & born_kab/10 < 8 if !mi(born_kab)

* java: province codes 31 through 36
  tab kode_prov
  gen java_sak = inrange(kode_prov, 31, 36)

* java 5 years ago (born_prov holds the province of residence 5 years ago)
  tab born_prov
  gen java_sak_5 = inrange(born_prov, 31, 36)

/*----------------------------------------------------*/
                /* Section: Employment */
/*----------------------------------------------------*/

* employed (first pass, built up from four questions; overwritten further down)
  * (worked at least 1 hour in past week)
  tabulate r9a, missing
  generate employed = (r9a == 1)

  * did any income-generating activity
  tabulate r9b, missing
  replace employed = 1 if r9b == 1

  * helped with work
  tabulate r9c, missing
  replace employed = 1 if r9c == 1

  * temporarily not working
  tabulate r10a, missing
  replace employed = 1 if r10a == 1
  tabulate employed, missing

* employment status: labelled copy of r12a
  tabulate r12a, missing
  generate employment_status = r12a
  label define status ///
    0 "Not Working" ///
    1 "Self-employed" ///
    2 "Business owner with temporary/unpaid workers" ///
    3 "Business owner with paid workers" ///
    4 "Employee" ///
    5 "Temporary worker in agriculture" ///
    6 "Temporary worker (non-agriculture)" ///
    7 "Family/unpaid worker" ///
    , replace
  label values employment_status status
  tabulate employment_status, missing

* Type of work: the categories including permanent job, self employed, business owner, etc (12A)
  generate self_employed      = (employment_status == 1)
  generate business_owner     = (employment_status == 2 | employment_status == 3)
  generate perm_worker        = (employment_status == 4)
  generate temp_worker        = (employment_status == 5 | employment_status == 6)
  generate family_unpaid      = (employment_status == 7)
  generate self_emp_bus_owner = inrange(employment_status, 1, 3) & inlist(employment_status, 1, 2, 3)

  * temporarily not working: 1 when r10a == 1, otherwise 0 (including missing r10a)
  generate temp_not_work = (r10a == 1)

* were you employed last week?
* Final definition: replaces the first-pass indicator with "status 1 to 6",
* so family/unpaid workers (status 7) count as not employed.
  replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)
  tabulate employed, missing
  tabulate employed employment_status, missing row

* Business field
  // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
  // business_field copies kbli2020_1 for employed respondents only; it is missing otherwise.
  tab kbli2020_1, m
  la def business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
    5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
    8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Information and Communication" 11 "Financial Services and Insurance" ///
    12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
  gen business_field = kbli2020_1 if employed == 1
  la val business_field business_fields
  tab business_field, m

* KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
  // occupation copies kbji2014 for employed respondents only; the value labels
  // below cover the one-digit codes 0-9 (no two-digit code is built here).
  // The tabulation of kbji2014 is run once (it was previously duplicated).
  tab kbji2014, m
  la def occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" 4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" 7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
  gen occupation = kbji2014 if employed == 1
  la val occupation occupations
  tab occupation, m


* wage last month (assuming this is July)
  summarize r14a_uang r14a2_brg if inlist(employment_status, 1, 4, 5, 6)
  tabulate r14a_uang if r14a_uang < 0

* Earnings not including business ownership
* Sum of r14a_uang and r14a2_brg, defined only for employment statuses 1, 4, 5, 6
* (missing when either component is missing).
  generate earnings = r14a_uang + r14a2_brg if inlist(employment_status, 1, 4, 5, 6)
  summarize earnings if inlist(employment_status, 1, 4, 5, 6), detail
  count if missing(earnings)

* Individual earnings, including zero if not employed
  generate ind_earnings = earnings
  replace ind_earnings = 0 if employed == 0
  summarize ind_earnings, detail

* Adjusted earnings: cap at the 1st/99th percentiles of the positive values
* (zeros are left untouched), then express in thousands.
  generate ind_adj_earnings = ind_earnings
  summarize ind_adj_earnings if ind_adj_earnings > 0, detail
  tempname earn_p1 earn_p99
  scalar `earn_p1'  = r(p1)
  scalar `earn_p99' = r(p99)
  replace ind_adj_earnings = `earn_p99' if ind_adj_earnings > `earn_p99' & !missing(ind_adj_earnings)
  replace ind_adj_earnings = `earn_p1' if ind_adj_earnings < `earn_p1' & !missing(ind_adj_earnings) & ind_adj_earnings != 0
  replace ind_adj_earnings = ind_adj_earnings/1000
  label variable ind_adj_earnings "Adjusted Earnings (in 1000s of rupiah, trimmed of 1st and 99th pct)"
  summarize ind_adj_earnings, detail
  generate ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

* Indicator for strictly positive earnings, missing when earnings are missing
  generate ind_earnings_pos = (ind_earnings > 0) if !missing(ind_earnings)

* hours worked last week (r16a1_jml): inspect, then keep a raw copy
  summarize r16a1_jml if inlist(employment_status, 1, 4, 5, 6)
  summarize r16a1_jml
  count if r16a1_jml > (7*14)
  histogram r16a1_jml
  generate hours_worked_raw = r16a1_jml
  summarize hours_worked_raw if hours_worked_raw != 0

* hours worked pre-covid (r16b2): inspect, then keep a raw copy
  summarize r16b2 if inlist(employment_status, 1, 4, 5, 6)
  summarize r16b2
  count if r16b2 > (7*14)
  histogram r16b2
  generate hours_worked_raw_precovid = r16b2
  summarize hours_worked_raw_precovid if hours_worked_raw_precovid != 0

* hours worked, truncated at 14 hours per day
* NOTE FROM MICHELLE: Check if the other SAKERNAS truncate hours worked for consistency
* Values above 14*7 weekly hours are capped; the not-employed are set to 0.
  generate hours_worked = hours_worked_raw
  replace hours_worked = 14*7 if hours_worked_raw != . & hours_worked_raw > 14*7
  replace hours_worked = 0 if employed == 0
  summarize hours_worked_raw hours_worked

* pre-covid hours worked, truncated at 14 hours per day
* NOTE FROM MICHELLE: Check if the other SAKERNAS truncate hours worked for consistency
* Same cap and same zero-for-not-employed rule as above.
  generate hours_worked_precovid = hours_worked_raw_precovid
  replace hours_worked_precovid = 14*7 if hours_worked_raw_precovid != . & hours_worked_raw_precovid > 14*7
  replace hours_worked_precovid = 0 if employed == 0
  summarize hours_worked_raw_precovid hours_worked_precovid

* hourly wage (income per day / hours per day)
* raw wages and hours
* Monthly earnings are spread over 31 days and weekly hours over 7 days.
* Built from the unscaled earnings and the uncapped hours.
  gen hourly_wage_raw = (earnings / 31) / (hours_worked_raw / 7)
  summ hourly_wage_raw, d

* Hourly wage from the adjusted earnings (ind_adj_earnings, already capped at
* the 1st/99th percentiles and divided by 1000) and the capped hours.
* Set to 0 when hours_worked is 0.
* NOTE(review): the earlier heading said "including business owners", but
* earnings is only defined for employment statuses 1, 4, 5, 6 - confirm.
  gen hourly_wage = (ind_adj_earnings/31)/(hours_worked / 7) if hours_worked != 0
  replace hourly_wage = 0 if hours_worked == 0
  gen poswage = hourly_wage if hourly_wage > 0

* Winsorize 1% and 99% percentiles
* Percentiles come from the positive values only; both -replace- lines read the
* r() results left by the -sum- directly above, so nothing that overwrites r()
* may be inserted between them. Zeros are not pulled up to the 1st percentile.
  gen hourly_adj_wage = hourly_wage
  sum hourly_adj_wage if hourly_adj_wage > 0, d
  replace hourly_adj_wage = r(p99) if hourly_adj_wage > r(p99) & !mi(hourly_adj_wage)
  replace hourly_adj_wage = r(p1) if hourly_adj_wage < r(p1) & !mi(hourly_adj_wage) & hourly_adj_wage != 0
  la var hourly_adj_wage "Adjusted Hourly Wage (in 1000s of rupiah, winsorized of 1st and 99th pct)"
  sum hourly_adj_wage, d

* Positive wages: the winsorized wage where positive, and a 0/1 flag for it
  gen poswage_adj = hourly_adj_wage if hourly_adj_wage > 0
  gen poswage_dummy = hourly_adj_wage > 0 if !mi(hourly_adj_wage)

* Replace wages as missing if no hours are worked
  gen hourly_wage_no0 = hourly_wage
  replace hourly_wage_no0 = . if hours_worked == 0

* Positive wages
  gen poswage_no0 = hourly_wage_no0 if hourly_wage_no0 > 0

* Winsorize 1% and 99% percentiles
* Same r() dependency as the winsorization above. Unlike that one, the lower
* cap here has no "!= 0" exclusion, so a zero wage is raised to the 1st percentile.
* NOTE(review): hourly_wage_no0 derives from ind_adj_earnings, which was already
* divided by 1000, so the extra "/1000" below looks like a second rescaling
* (hourly_adj_wage above is not divided again, yet both labels say "in 1000s").
* Confirm the intended unit; changing it alters the data checked by the
* datasignature guard at save time.
  gen hourly_adj_wage_no0 = hourly_wage_no0
  sum hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0, d
  replace hourly_adj_wage_no0 = r(p99) if hourly_adj_wage_no0 > r(p99) & !mi(hourly_adj_wage_no0)
  replace hourly_adj_wage_no0 = r(p1) if hourly_adj_wage_no0 < r(p1) & !mi(hourly_adj_wage_no0)
  replace hourly_adj_wage_no0 = hourly_adj_wage_no0/1000 
  la var hourly_adj_wage_no0 "Adjusted Hourly Wage (no zeroes, in 1000s of rupiah, winsorized of 1st and 99th pct)"  
  sum hourly_adj_wage_no0, d
  gen poswage_adj_no0 = hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0

* Household earnings
* mi_ind_earnings flags households in which any member has missing ind_earnings;
* hh_earnings is the household total of ind_earnings and is left missing for
* every member of such a household.
  bys hh_id_s: egen mi_ind_earnings = max(mi(ind_earnings))
  bys hh_id_s: egen hh_earnings = total(ind_earnings) if mi_ind_earnings ==0
  gen hh_pos_earnings = hh_earnings if hh_earnings > 0

* Household earnings in thousands. The variable label now describes household
* earnings (it previously read "Adjusted Hourly Wage").
  gen hh_adj_earnings = hh_earnings // no adjustment here?
  replace hh_adj_earnings = hh_adj_earnings / 1000
  la var hh_adj_earnings "Household Earnings (in 1000s of rupiah)"
  sum hh_adj_earnings, d

  gen hh_pos_adj_earnings = hh_adj_earnings if hh_adj_earnings > 0
  sum hh_pos_adj_earnings, d

* 0/1 flag for strictly positive household earnings, missing when the total is missing
  gen hh_earnings_pos = hh_earnings > 0 if !mi(hh_earnings)
  sum hh_earnings_pos, d

  // Exchange rate: 14,403.60 rp/$
  // median monthly salary (assuming 40 hours/week: 4 weeks * 5 days * 8 hours/day)
  // The median must come from an hourly wage, so it is summarized explicitly
  // here; previously r(p50) was whatever the last -sum- left behind, i.e. the
  // median of the 0/1 flag hh_earnings_pos.
  // NOTE(review): hourly_wage_raw is used because it is the hourly wage that is
  // not divided by 1000, which matches the rp/$ conversion - confirm this is
  // the intended series.
  summ hourly_wage_raw, d
  di `r(p50)' * 4 * 5 * 8
  di `r(p50)' * 4 * 5 * 8 / 14400

* imputed wage (using occupation, field, province, age, and gender)
* Inspect the outcome and each predictor before fitting.
  summarize hourly_wage
  tabulate occupation
  tabulate business_field
  tabulate kode_prov
  summarize age_sak
  tabulate female

* Linear regression with robust standard errors; fitted values are stored only
* for observations whose hourly_wage is not the system missing value.
  regress hourly_wage ///
    i.occupation i.business_field i.kode_prov i.age_sak female, ///
    vce(robust)
  predict wage_impute if hourly_wage != .
  summarize wage_impute hourly_wage
  correlate wage_impute hourly_wage

* change in wage from before covid?
* Indicator for r14b == 2 (0 for any other value, including missing).
  tab r14b, m
  gen covid_wage_loss = r14b == 2

  * start year
  * Values 0 and 9999 in r15a_th are treated as missing.
  gen year_tenure = r15a_th
  replace year_tenure = . if year_tenure == 0 | year_tenure == 9999
  tab year_tenure if employed == 1, m

  * start month
  * Value 0 in r15a_bln is treated as missing; 99 is handled just below.
  gen month_tenure = r15a_bln
  replace month_tenure = . if month_tenure == 0
  tab month_tenure if employed == 1, m

  // set to middle of year if month missing but year is not
  // (month code 99 becomes 6 when the year is known, missing otherwise)
  replace month_tenure = 6 if month_tenure == 99 & year_tenure != .
  replace month_tenure = . if month_tenure == 99 & year_tenure == .
  tab month_tenure if employed == 1, m

  * create start date
  * Monthly date built from year and month; missing if either part is missing.
  gen start_date = ym(year_tenure, month_tenure)
  format start_date %tm
  summ start_date, format
  count if start_date == . & employed == 1

  * create tenure in months
  * NOTE(review): tenure is measured up to tm(2020m8), while the file header
  * describes this as the August 2021 round. Jobs started after 2020m8 get a
  * negative tenure here - confirm the reference month. Changing it alters the
  * data checked by the datasignature guard at save time.
  summ start_date, format
  count if start_date < tm(2020m8)
  gen tenure_months = tm(2020m8) - start_date
  summ tenure_months, d
  di `r(max)'/12
  count if tenure_months == . & employed == 1
  hist tenure_months

  * check
  gen age_months = 12 * age_sak
  // subtract 48 months to get number of months one has been at least 4 years old (approximately)
  replace age_months = age_months - 48
  // check that respondents do not report being at same job since from before they were 4 years old
  // (the assert is wrapped in -cap noi-, so a failure is printed but does not stop the run)
  summ age_months
  cap noi assert tenure_months < age_months if tenure_months != .
  // keep the unedited tenure as tenure_months_raw, then blank out tenures that
  // are not shorter than age_months in the cleaned copy
  rename tenure_months tenure_months_raw
  gen tenure_months = tenure_months_raw
  replace tenure_months = . if tenure_months >= age_months
  // helper variables are dropped; start_date is kept
  drop year_tenure month_tenure age_months
  summ tenure_months*
  tab tenure_months if employed == 1, m

* use internet at work
* NOTE(review): r17a1 is tabulated but the indicator is built from r17b -
* confirm which item is intended.
  tabulate r17a1, missing
  // missing where r17b is 0; then filled with 0 for the not-employed
  generate use_internet_work = (r17b == 1) if r17b != 0
  replace use_internet_work = 0 if missing(use_internet_work) & employed == 0
  tabulate use_internet_work

* had more than one job
  tabulate r27a, missing
  generate multiple_jobs = (r27a == 1)
  tabulate multiple_jobs, missing

* job search/business creation
  tabulate r29a, missing
  generate looking_job = (r29a == 1)

  tabulate r29b, missing
  generate prep_new_bus = (r29b == 1)

* any of r31a-r31f answered 1
  generate activities_job = (r31a == 1 | r31b == 1 | r31c == 1 | r31d == 1 | r31e == 1 | r31f == 1)
  label variable activities_job "Any effort to find a job"

  * Note: 1 = yes, 2 = no, 0 = not asked
  generate reg_job_mkt     = (r31a == 1)
  generate contact_company = (r31b == 1)
  generate advert_online   = (r31c == 1)
  generate search_network  = (r31d == 1)
  generate cap_loc_lic     = (r31e == 1)
  generate any_attempt     = (r31f == 1)

* labor force: employed, looking for a job, or preparing a new business
  tabulate employed, missing
  generate labor_force = (employed == 1) | (looking_job == 1) | (prep_new_bus == 1)
  tabulate labor_force, missing

* anyone in HH own business with employees?
  generate with_emp = (employment_status == 2 | employment_status == 3)

*bys hh_id: egen with_emp_HH = max(with_emp)
  bysort hh_id_s: egen with_emp_HH_S = max(with_emp)

* Actively starting a new business (22B) --this we can do for anyone in household
* Household-level maxima: 1 when any member has the individual flag set.
  bysort hh_id_s: egen prep_bus_hh = max(prep_new_bus)
  bysort hh_id_s: egen hh_emp_self = max(self_employed)

/*----------------------------------------------------*/
            /* Section: Kartu Prakerja */
/*----------------------------------------------------*/

  * do you know of kartu prakerja
  tabulate r44a, missing
  generate prakerja_familiar = (r44a == 1)

  * did you sign up for kartu prakerja?
  tabulate r44b, missing
  generate report_applied = (r44b == 1)
  tabulate report_applied

  * main reason signed up for kartu prakerja (code 0 is left missing)
  tabulate r44d, missing
  generate reason_prakerja = r44d if r44d != 0
  label define reasons ///
    1 "Skills" ///
    2 "Incentives" ///
    3 "Fill spare time" ///
    4 "Join friends/try repeatedly" ///
    5 "Free registration" ///
    6 "Other" ///
    , replace
  label values reason_prakerja reasons
  tabulate reason_prakerja

  * selected for kartu prakerja
  tabulate r44e, missing
  generate report_selected = (r44e == 1)
  tabulate report_applied report_selected

  * completed kartu prakerja training
  tabulate r44f, missing
  generate complete_pk_train = (r44f == 1)
  tabulate complete_pk_train, missing

  * kartu prakerja improved skills
  tabulate r44g, missing
  generate pk_improve_skill = (r44g == 1)
  tabulate pk_improve_skill, missing

  * what are KP benefits spent on?
  * One indicator per item r44h1-r44h5, missing where the item is coded 0.
  local uses needs business debt save other
  forvalues j = 1/5 {
    local use : word `j' of `uses'

    tabulate r44h`j', missing
    generate use_pk_aid_`use' = (r44h`j' == 1) if r44h`j' != 0
  }
  summarize use_pk_aid*

* new job since the date you registered for Prakerja (since batch 1 will be fine) (we know date you started your job in 15a -- note this is only for main job)
* batch 1 starts 11 April 2020
* 1 when the job start month is April 2020 or later; forced to 0 for business
* owners and the not-employed; missing when the start date is unknown otherwise.
  generate new_job = (start_date >= tm(2020m4)) if !missing(start_date)
  replace new_job = 0 if employed == 0 | business_owner == 1
  tabulate new_job, missing

* new business since start of batch
* Same date rule, but forced to 0 for everyone who is not a business owner.
  generate new_start = (start_date >= tm(2020m4)) if !missing(start_date)
  replace new_start = 0 if employed == 0 | business_owner != 1
  tabulate new_start, missing

* Note, missing r43_4 and r43_5
  generate prog_subs        = (r43_1 == 1)
  generate prog_food_assist = (r43_2 == 1)
  generate prog_direct_cash = (r43_3 == 1)
  generate prog_other       = (r43_6 == 1)

******************************************************
* check duplicates: count how many other rows share each anon_id4
  duplicates tag anon_id4, generate(anon_id4_dupe)
  tabulate anon_id4_dupe

* drop duplicates: every row of a repeated, non-missing anon_id4 is removed;
* rows with a missing anon_id4 are kept
  drop if !missing(anon_id4) & anon_id4_dupe != 0
  drop anon_id4_dupe

/*----------------------------------------------------*/
/* Section: Export Cleaned Data with Baseline controls */
/*----------------------------------------------------*/

* save
* The cleaned file is written only when the data in memory reproduce the
* expected datasignature. On a mismatch the run aborts with an explicit
* non-zero return code (previously it aborted by calling the undefined command
* -stop-), and both signatures are shown to make the difference traceable.
  datasignature
  local sig_found "`r(datasignature)'"
  local sig_expected "777581:326(113756):2881593687:2393004404"
  if "`sig_found'" == "`sig_expected'" {
    save "$KP_deid_sakernas/Clean/sak_aug21_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    di as err "  expected signature: `sig_expected'"
    di as err "  found signature:    `sig_found'"
    exit 459
  }
  


/*----------------------------------------------------*/
            * Merge with admin data and export
/*----------------------------------------------------*/

  * keep rows that carry at least one of the two identifiers
  drop if missing(userid_bps) & missing(anon_id4)

  * drop the raw survey items that are no longer needed
  drop r? r?? r??? r???? ///
    k? klas jart* k5_* ///
    r6* r7* r8* r10* r12* r14* r15* r16* r17* ///
    r18 r19 r20 r21* r27* r30_* r34* r35* r39* r40* r41* ///
    kbji* date_incentive

  * survey round identifier
  generate sak_round = 7

* merge
* Stash the rows with a non-missing anon_id4 in a temporary file ...
  preserve
  keep if !missing(anon_id4)
  tempfile sak_anon_id
  save `sak_anon_id'
  restore

* ... then load the admin data and keep only the rows matched on anon_id4
  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear

  merge m:1 anon_id4 using `sak_anon_id', nogenerate keep(match)


  compress
  * The merged file is written only when the data in memory reproduce the
  * expected datasignature. On a mismatch the run aborts with an explicit
  * non-zero return code (previously it aborted by calling the undefined
  * command -stop-), and both signatures are shown.
  datasignature
  local sig_found "`r(datasignature)'"
  local sig_expected "192944:213(36228):613410903:630630513"
  if "`sig_found'" == "`sig_expected'" {
    save "$KP_deid_sakernas/Clean/sak_aug21_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    di as err "  expected signature: `sig_expected'"
    di as err "  found signature:    `sig_found'"
    exit 459
  }
	

  // done
